R Tutorial
An introduction to R
Introduction
This tutorial is will introduce the reader to , a free, open-source statistical computing environment often used with RStudio, a integrated development environment for .
R Project Logo
Download
- Download at https://www.r-project.org/
- Download
RStudioat https://rstudio.com/products/rstudio/download/
Calculator
can be used as a super awesome calculator
# 5 + 3 = 8
5 + 3 ## [1] 8
# 24 / (1 + 2) = 8
24 / (1 + 2) ## [1] 8
# 2 * 2 * 2 = 8
2^3 ## [1] 8
# 8 * 8 = 64
sqrt(64) ## [1] 8
# -log10(0.05 / 5000000) = 8
-log10(0.05 / 5000000) ## [1] 8
Functions
has many useful built in functions
1:10## [1] 1 2 3 4 5 6 7 8 9 10
as.character(1:10)## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"
rep(1:2, times = 5)## [1] 1 2 1 2 1 2 1 2 1 2
rep(1:5, times = 2)## [1] 1 2 3 4 5 1 2 3 4 5
rep(1:5, each = 2)## [1] 1 1 2 2 3 3 4 4 5 5
rep(1:5, length.out = 7)## [1] 1 2 3 4 5 1 2
seq(5, 50, by = 5)## [1] 5 10 15 20 25 30 35 40 45 50
seq(5, 50, length.out = 5)## [1] 5.00 16.25 27.50 38.75 50.00
paste(1:10, 20:30, sep = "-")## [1] "1-20" "2-21" "3-22" "4-23" "5-24" "6-25" "7-26" "8-27" "9-28" "10-29" "1-30"
paste(1:10, collapse = "-")## [1] "1-2-3-4-5-6-7-8-9-10"
paste0("x", 1:10)## [1] "x1" "x2" "x3" "x4" "x5" "x6" "x7" "x8" "x9" "x10"
min(1:10)## [1] 1
max(1:10)## [1] 10
range(1:10)## [1] 1 10
mean(1:10)## [1] 5.5
sd(1:10)## [1] 3.02765
Custom Functions
Users can also create their own functions
customFunction1 <- function(x, y) {
z <- 100 * x / (x + y)
paste(z, "%")
}
customFunction1(x = 10, y = 90)## [1] "10 %"
customFunction2 <- function(x) {
mymin <- mean(x - sd(x))
mymax <- mean(x) + sd(x)
print(paste("Min =", mymin))
print(paste("Max =", mymax))
}
customFunction2(x = 1:10)## [1] "Min = 2.47234964590251"
## [1] "Max = 8.52765035409749"
for loops and if else
statements
xx <- NULL #creates and empty object
for(i in 1:10) {
xx[i] <- i*3
}
xx## [1] 3 6 9 12 15 18 21 24 27 30
xx %% 2 #gives the remainder when divided by 2## [1] 1 0 1 0 1 0 1 0 1 0
for(i in 1:length(xx)) {
if((xx[i] %% 2) == 0) {
print(paste(xx[i],"is Even"))
} else {
print(paste(xx[i],"is Odd"))
}
}## [1] "3 is Odd"
## [1] "6 is Even"
## [1] "9 is Odd"
## [1] "12 is Even"
## [1] "15 is Odd"
## [1] "18 is Even"
## [1] "21 is Odd"
## [1] "24 is Even"
## [1] "27 is Odd"
## [1] "30 is Even"
# or
ifelse(xx %% 2 == 0, "Even", "Odd")## [1] "Odd" "Even" "Odd" "Even" "Odd" "Even" "Odd" "Even" "Odd" "Even"
paste(xx, ifelse(xx %% 2 == 0, "is Even", "is Odd"))## [1] "3 is Odd" "6 is Even" "9 is Odd" "12 is Even" "15 is Odd" "18 is Even" "21 is Odd" "24 is Even" "27 is Odd"
## [10] "30 is Even"
Objects
Information can be stored in user defined objects, in multiple forms:
c(): a string of valuesmatrix(): a two dimensional matrix in one formatdata.frame(): a two dimensional matrix where each column can be a different formatlist():
A string…
xc <- 1:10
xc## [1] 1 2 3 4 5 6 7 8 9 10
xc <- c(1,2,3,4,5,6,7,8,9,10)
xc## [1] 1 2 3 4 5 6 7 8 9 10
A matrix…
xm <- matrix(1:100, nrow = 10, ncol = 10, byrow = T)
xm## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 1 2 3 4 5 6 7 8 9 10
## [2,] 11 12 13 14 15 16 17 18 19 20
## [3,] 21 22 23 24 25 26 27 28 29 30
## [4,] 31 32 33 34 35 36 37 38 39 40
## [5,] 41 42 43 44 45 46 47 48 49 50
## [6,] 51 52 53 54 55 56 57 58 59 60
## [7,] 61 62 63 64 65 66 67 68 69 70
## [8,] 71 72 73 74 75 76 77 78 79 80
## [9,] 81 82 83 84 85 86 87 88 89 90
## [10,] 91 92 93 94 95 96 97 98 99 100
xm <- matrix(1:100, nrow = 10, ncol = 10, byrow = F)
xm## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 1 11 21 31 41 51 61 71 81 91
## [2,] 2 12 22 32 42 52 62 72 82 92
## [3,] 3 13 23 33 43 53 63 73 83 93
## [4,] 4 14 24 34 44 54 64 74 84 94
## [5,] 5 15 25 35 45 55 65 75 85 95
## [6,] 6 16 26 36 46 56 66 76 86 96
## [7,] 7 17 27 37 47 57 67 77 87 97
## [8,] 8 18 28 38 48 58 68 78 88 98
## [9,] 9 19 29 39 49 59 69 79 89 99
## [10,] 10 20 30 40 50 60 70 80 90 100
A data frame…
xd <- data.frame(
x1 = c("aa","bb","cc","dd","ee",
"ff","gg","hh","ii","jj"),
x2 = 1:10,
x3 = c(1,1,1,1,1,2,2,2,3,3),
x4 = rep(c(1,2), times = 5),
x5 = rep(1:5, times = 2),
x6 = rep(1:5, each = 2),
x7 = seq(5, 50, by = 5),
x8 = log10(1:10),
x9 = (1:10)^3,
x10 = c(T,T,T,F,F,T,T,F,F,F)
)
xd## x1 x2 x3 x4 x5 x6 x7 x8 x9 x10
## 1 aa 1 1 1 1 1 5 0.0000000 1 TRUE
## 2 bb 2 1 2 2 1 10 0.3010300 8 TRUE
## 3 cc 3 1 1 3 2 15 0.4771213 27 TRUE
## 4 dd 4 1 2 4 2 20 0.6020600 64 FALSE
## 5 ee 5 1 1 5 3 25 0.6989700 125 FALSE
## 6 ff 6 2 2 1 3 30 0.7781513 216 TRUE
## 7 gg 7 2 1 2 4 35 0.8450980 343 TRUE
## 8 hh 8 2 2 3 4 40 0.9030900 512 FALSE
## 9 ii 9 3 1 4 5 45 0.9542425 729 FALSE
## 10 jj 10 3 2 5 5 50 1.0000000 1000 FALSE
A list…
xl <- list(xc, xm, xd)
xl[[1]]## [1] 1 2 3 4 5 6 7 8 9 10
xl[[2]]## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 1 11 21 31 41 51 61 71 81 91
## [2,] 2 12 22 32 42 52 62 72 82 92
## [3,] 3 13 23 33 43 53 63 73 83 93
## [4,] 4 14 24 34 44 54 64 74 84 94
## [5,] 5 15 25 35 45 55 65 75 85 95
## [6,] 6 16 26 36 46 56 66 76 86 96
## [7,] 7 17 27 37 47 57 67 77 87 97
## [8,] 8 18 28 38 48 58 68 78 88 98
## [9,] 9 19 29 39 49 59 69 79 89 99
## [10,] 10 20 30 40 50 60 70 80 90 100
xl[[3]]## x1 x2 x3 x4 x5 x6 x7 x8 x9 x10
## 1 aa 1 1 1 1 1 5 0.0000000 1 TRUE
## 2 bb 2 1 2 2 1 10 0.3010300 8 TRUE
## 3 cc 3 1 1 3 2 15 0.4771213 27 TRUE
## 4 dd 4 1 2 4 2 20 0.6020600 64 FALSE
## 5 ee 5 1 1 5 3 25 0.6989700 125 FALSE
## 6 ff 6 2 2 1 3 30 0.7781513 216 TRUE
## 7 gg 7 2 1 2 4 35 0.8450980 343 TRUE
## 8 hh 8 2 2 3 4 40 0.9030900 512 FALSE
## 9 ii 9 3 1 4 5 45 0.9542425 729 FALSE
## 10 jj 10 3 2 5 5 50 1.0000000 1000 FALSE
Selecting Data
xc[5] # 5th element in xc## [1] 5
xd$x3[5] # 5th element in col "x3"## [1] 1
xd[5,"x3"] # row 5, col "x3"## [1] 1
xd$x3 # all of col "x3"## [1] 1 1 1 1 1 2 2 2 3 3
xd[,"x3"] # all rows, col "x3"## [1] 1 1 1 1 1 2 2 2 3 3
xd[3,] # row 3, all cols## x1 x2 x3 x4 x5 x6 x7 x8 x9 x10
## 3 cc 3 1 1 3 2 15 0.4771213 27 TRUE
xd[c(2,4),c("x4","x5")] # rows 2 & 4, cols "x4" & "x5"## x4 x5
## 2 2 2
## 4 2 4
xl[[3]]$x1 # 3rd object in the list, col "x1## [1] "aa" "bb" "cc" "dd" "ee" "ff" "gg" "hh" "ii" "jj"
regexpr
xx <- data.frame(Name = c("Item 1 (detail 1)",
"Item 20 (detail 20)",
"Item 300 (detail 300)"),
Item = NA,
Detail = NA)
xx$Detail <- substr(xx$Name, regexpr("\\(", xx$Name)+1, regexpr("\\)", xx$Name)-1)
xx$Item <- substr(xx$Name, 1, regexpr("\\(", xx$Name)-2)
xx## Name Item Detail
## 1 Item 1 (detail 1) Item 1 detail 1
## 2 Item 20 (detail 20) Item 20 detail 20
## 3 Item 300 (detail 300) Item 300 detail 300
Data Formats
Data can also be saved in many formats:
- numeric
- integer
- character
- factor
- logical
xd$x3 <- as.character(xd$x3)
xd$x3## [1] "1" "1" "1" "1" "1" "2" "2" "2" "3" "3"
xd$x3 <- as.numeric(xd$x3)
xd$x3## [1] 1 1 1 1 1 2 2 2 3 3
xd$x3 <- as.factor(xd$x3)
xd$x3## [1] 1 1 1 1 1 2 2 2 3 3
## Levels: 1 2 3
xd$x3 <- factor(xd$x3, levels = c("3","2","1"))
xd$x3## [1] 1 1 1 1 1 2 2 2 3 3
## Levels: 3 2 1
xd$x10## [1] TRUE TRUE TRUE FALSE FALSE TRUE TRUE FALSE FALSE FALSE
as.numeric(xd$x10) # TRUE = 1, FALSE = 0## [1] 1 1 1 0 0 1 1 0 0 0
sum(xd$x10)## [1] 5
Internal structure of an object can be checked with
str()
str(xc) # c()## num [1:10] 1 2 3 4 5 6 7 8 9 10
str(xm) # matrix()## int [1:10, 1:10] 1 2 3 4 5 6 7 8 9 10 ...
str(xd) # data.frame()## 'data.frame': 10 obs. of 10 variables:
## $ x1 : chr "aa" "bb" "cc" "dd" ...
## $ x2 : int 1 2 3 4 5 6 7 8 9 10
## $ x3 : Factor w/ 3 levels "3","2","1": 3 3 3 3 3 2 2 2 1 1
## $ x4 : num 1 2 1 2 1 2 1 2 1 2
## $ x5 : int 1 2 3 4 5 1 2 3 4 5
## $ x6 : int 1 1 2 2 3 3 4 4 5 5
## $ x7 : num 5 10 15 20 25 30 35 40 45 50
## $ x8 : num 0 0.301 0.477 0.602 0.699 ...
## $ x9 : num 1 8 27 64 125 216 343 512 729 1000
## $ x10: logi TRUE TRUE TRUE FALSE FALSE TRUE ...
str(xl) # list()## List of 3
## $ : num [1:10] 1 2 3 4 5 6 7 8 9 10
## $ : int [1:10, 1:10] 1 2 3 4 5 6 7 8 9 10 ...
## $ :'data.frame': 10 obs. of 10 variables:
## ..$ x1 : chr [1:10] "aa" "bb" "cc" "dd" ...
## ..$ x2 : int [1:10] 1 2 3 4 5 6 7 8 9 10
## ..$ x3 : num [1:10] 1 1 1 1 1 2 2 2 3 3
## ..$ x4 : num [1:10] 1 2 1 2 1 2 1 2 1 2
## ..$ x5 : int [1:10] 1 2 3 4 5 1 2 3 4 5
## ..$ x6 : int [1:10] 1 1 2 2 3 3 4 4 5 5
## ..$ x7 : num [1:10] 5 10 15 20 25 30 35 40 45 50
## ..$ x8 : num [1:10] 0 0.301 0.477 0.602 0.699 ...
## ..$ x9 : num [1:10] 1 8 27 64 125 216 343 512 729 1000
## ..$ x10: logi [1:10] TRUE TRUE TRUE FALSE FALSE TRUE ...
Packages
Additional libraries can be installed and loaded for use.
install.packages("scales")library(scales)
xx <- data.frame(Values = 1:10)
xx$Rescaled <- rescale(x = xx$Values, to = c(1,30))
xx## Values Rescaled
## 1 1 1.000000
## 2 2 4.222222
## 3 3 7.444444
## 4 4 10.666667
## 5 5 13.888889
## 6 6 17.111111
## 7 7 20.333333
## 8 8 23.555556
## 9 9 26.777778
## 10 10 30.000000
libraries can also be used without having to load them
scales::rescale(1:10, to = c(1,30))## [1] 1.000000 4.222222 7.444444 10.666667 13.888889 17.111111 20.333333 23.555556 26.777778 30.000000
Data Wrangling
R for Data Science - https://r4ds.had.co.nz/
xx <- data.frame(Group = c("X","X","Y","Y","Y","X","X","X","Y","Y"),
Data1 = 1:10,
Data2 = seq(10, 100, by = 10))
xx$NewData1 <- xx$Data1 + xx$Data2
xx$NewData2 <- xx$Data1 * 1000
xx## Group Data1 Data2 NewData1 NewData2
## 1 X 1 10 11 1000
## 2 X 2 20 22 2000
## 3 Y 3 30 33 3000
## 4 Y 4 40 44 4000
## 5 Y 5 50 55 5000
## 6 X 6 60 66 6000
## 7 X 7 70 77 7000
## 8 X 8 80 88 8000
## 9 Y 9 90 99 9000
## 10 Y 10 100 110 10000
xx$Data1 < 5 # which are less than 5## [1] TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
xx[xx$Data1 < 5,]## Group Data1 Data2 NewData1 NewData2
## 1 X 1 10 11 1000
## 2 X 2 20 22 2000
## 3 Y 3 30 33 3000
## 4 Y 4 40 44 4000
xx[xx$Group == "X", c("Group","Data2","NewData1")]## Group Data2 NewData1
## 1 X 10 11
## 2 X 20 22
## 6 X 60 66
## 7 X 70 77
## 8 X 80 88
Data wrangling with tidyverse and pipes
(%>%)
library(tidyverse) # install.packages("tidyverse")
xx <- data.frame(Group = c("X","X","Y","Y","Y","Y","Y","X","X","X")) %>%
mutate(Data1 = 1:10,
Data2 = seq(10, 100, by = 10),
NewData1 = Data1 + Data2,
NewData2 = Data1 * 1000)
xx## Group Data1 Data2 NewData1 NewData2
## 1 X 1 10 11 1000
## 2 X 2 20 22 2000
## 3 Y 3 30 33 3000
## 4 Y 4 40 44 4000
## 5 Y 5 50 55 5000
## 6 Y 6 60 66 6000
## 7 Y 7 70 77 7000
## 8 X 8 80 88 8000
## 9 X 9 90 99 9000
## 10 X 10 100 110 10000
filter(xx, Data1 < 5)## Group Data1 Data2 NewData1 NewData2
## 1 X 1 10 11 1000
## 2 X 2 20 22 2000
## 3 Y 3 30 33 3000
## 4 Y 4 40 44 4000
xx %>% filter(Data1 < 5)## Group Data1 Data2 NewData1 NewData2
## 1 X 1 10 11 1000
## 2 X 2 20 22 2000
## 3 Y 3 30 33 3000
## 4 Y 4 40 44 4000
xx %>% filter(Group == "X") %>%
select(Group, NewColName=Data2, NewData1)## Group NewColName NewData1
## 1 X 10 11
## 2 X 20 22
## 3 X 80 88
## 4 X 90 99
## 5 X 100 110
xs <- xx %>%
group_by(Group) %>%
summarise(Data2_mean = mean(Data2),
Data2_sd = sd(Data2),
NewData2_mean = mean(NewData2),
NewData2_sd = sd(NewData2))
xs## # A tibble: 2 × 5
## Group Data2_mean Data2_sd NewData2_mean NewData2_sd
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 X 60 41.8 6000 4183.
## 2 Y 50 15.8 5000 1581.
xx %>% left_join(xs, by = "Group")## Group Data1 Data2 NewData1 NewData2 Data2_mean Data2_sd NewData2_mean NewData2_sd
## 1 X 1 10 11 1000 60 41.83300 6000 4183.300
## 2 X 2 20 22 2000 60 41.83300 6000 4183.300
## 3 Y 3 30 33 3000 50 15.81139 5000 1581.139
## 4 Y 4 40 44 4000 50 15.81139 5000 1581.139
## 5 Y 5 50 55 5000 50 15.81139 5000 1581.139
## 6 Y 6 60 66 6000 50 15.81139 5000 1581.139
## 7 Y 7 70 77 7000 50 15.81139 5000 1581.139
## 8 X 8 80 88 8000 60 41.83300 6000 4183.300
## 9 X 9 90 99 9000 60 41.83300 6000 4183.300
## 10 X 10 100 110 10000 60 41.83300 6000 4183.300
Read/Write data
xx <- read.csv("data_r_tutorial.csv")
write.csv(xx, "data_r_tutorial.csv", row.names = F)For excel sheets, the package readxl can be used to read
in sheets of data.
library(readxl) # install.packages("readxl")
xx <- read_xlsx("data_r_tutorial.xlsx", sheet = "Data")Tidy Data
- Tutorial 1 - https://cran.r-project.org/web/packages/tidyr/vignettes/tidy-data.html
- Tutorial 2 - https://r4ds.had.co.nz/tidy-data.html
yy <- xx %>%
group_by(Name, Location) %>%
summarise(Mean_DTF = round(mean(DTF),1)) %>%
arrange(Location)
yy## # A tibble: 9 × 3
## # Groups: Name [3]
## Name Location Mean_DTF
## <chr> <chr> <dbl>
## 1 CDC Maxim AGL Jessore, Bangladesh 86.7
## 2 ILL 618 AGL Jessore, Bangladesh 79.3
## 3 Laird AGL Jessore, Bangladesh 76.8
## 4 CDC Maxim AGL Metaponto, Italy 134.
## 5 ILL 618 AGL Metaponto, Italy 138.
## 6 Laird AGL Metaponto, Italy 137.
## 7 CDC Maxim AGL Saskatoon, Canada 52.5
## 8 ILL 618 AGL Saskatoon, Canada 47
## 9 Laird AGL Saskatoon, Canada 56.8
yy <- yy %>% spread(key = Location, value = Mean_DTF)
yy## # A tibble: 3 × 4
## # Groups: Name [3]
## Name `Jessore, Bangladesh` `Metaponto, Italy` `Saskatoon, Canada`
## <chr> <dbl> <dbl> <dbl>
## 1 CDC Maxim AGL 86.7 134. 52.5
## 2 ILL 618 AGL 79.3 138. 47
## 3 Laird AGL 76.8 137. 56.8
yy <- yy %>% gather(key = TraitName, value = Value, 2:4)
yy## # A tibble: 9 × 3
## # Groups: Name [3]
## Name TraitName Value
## <chr> <chr> <dbl>
## 1 CDC Maxim AGL Jessore, Bangladesh 86.7
## 2 ILL 618 AGL Jessore, Bangladesh 79.3
## 3 Laird AGL Jessore, Bangladesh 76.8
## 4 CDC Maxim AGL Metaponto, Italy 134.
## 5 ILL 618 AGL Metaponto, Italy 138.
## 6 Laird AGL Metaponto, Italy 137.
## 7 CDC Maxim AGL Saskatoon, Canada 52.5
## 8 ILL 618 AGL Saskatoon, Canada 47
## 9 Laird AGL Saskatoon, Canada 56.8
yy <- yy %>% spread(key = Name, value = Value)
yy## # A tibble: 3 × 4
## TraitName `CDC Maxim AGL` `ILL 618 AGL` `Laird AGL`
## <chr> <dbl> <dbl> <dbl>
## 1 Jessore, Bangladesh 86.7 79.3 76.8
## 2 Metaponto, Italy 134. 138. 137.
## 3 Saskatoon, Canada 52.5 47 56.8
Base Plotting
We will start with some basic plotting using the base function
plot()
# A basic scatter plot
plot(x = xd$x8, y = xd$x9)# Adjust color and shape of the points
plot(x = xd$x8, y = xd$x9, col = "darkred", pch = 0)plot(x = xd$x8, y = xd$x9, col = xd$x4, pch = xd$x4)# Adjust plot type
plot(x = xd$x8, y = xd$x9, type = "line")# Adjust linetype
plot(x = xd$x8, y = xd$x9, type = "line", lty = 2)# Plot lines and points
plot(x = xd$x8, y = xd$x9, type = "both")Now lets create some random and normally distributed data to make some more complicated plots
# 100 random uniformly distributed numbers ranging from 0 - 100
ru <- runif(100, min = 0, max = 100)
ru## [1] 8.463383 54.139168 7.461632 23.802051 88.626144 37.470864 99.043417 74.903323 44.024084 53.272053 56.467480 99.666692
## [13] 59.446673 16.955454 64.631870 89.923305 36.343187 92.346770 90.035454 64.796573 42.263243 55.216484 70.062278 30.449396
## [25] 82.816755 65.032484 46.700755 19.052606 87.405043 64.103455 32.215668 74.425839 89.132146 46.106220 89.152510 89.945036
## [37] 59.343087 9.174369 76.128983 7.353001 97.377630 87.244911 82.715986 99.275667 41.605246 11.754936 81.243938 55.099300
## [49] 50.230504 94.183733 13.159754 34.688567 7.969982 15.382676 99.046164 40.478117 24.112145 28.891543 74.827315 43.203293
## [61] 61.430168 75.229284 65.272136 53.359983 10.898973 71.368186 11.453480 63.865923 39.220340 69.911651 86.290024 40.950687
## [73] 83.338173 82.389993 60.148235 45.333279 69.092387 71.417452 78.633691 95.350417 85.333785 88.224578 63.033702 37.209597
## [85] 75.684852 47.373712 81.024006 29.452758 31.445909 18.006039 63.161534 25.690089 17.217652 24.704244 45.328251 40.532662
## [97] 64.269429 98.490422 12.615358 94.554952
plot(x = ru)order(ru)## [1] 40 3 53 1 38 65 67 46 99 51 54 14 93 90 28 4 57 94 92 58 88 24 89 31 52 17 84 6 69 56 96
## [32] 72 45 21 60 9 95 76 34 27 86 49 10 64 2 48 22 11 37 13 75 61 83 91 68 30 97 15 20 26 63 77
## [63] 70 23 66 78 32 59 8 62 85 39 79 87 47 74 43 25 73 81 71 42 29 82 5 33 35 16 36 19 18 50 100
## [94] 80 41 98 7 55 44 12
ru<- ru[order(ru)]
ru## [1] 7.353001 7.461632 7.969982 8.463383 9.174369 10.898973 11.453480 11.754936 12.615358 13.159754 15.382676 16.955454
## [13] 17.217652 18.006039 19.052606 23.802051 24.112145 24.704244 25.690089 28.891543 29.452758 30.449396 31.445909 32.215668
## [25] 34.688567 36.343187 37.209597 37.470864 39.220340 40.478117 40.532662 40.950687 41.605246 42.263243 43.203293 44.024084
## [37] 45.328251 45.333279 46.106220 46.700755 47.373712 50.230504 53.272053 53.359983 54.139168 55.099300 55.216484 56.467480
## [49] 59.343087 59.446673 60.148235 61.430168 63.033702 63.161534 63.865923 64.103455 64.269429 64.631870 64.796573 65.032484
## [61] 65.272136 69.092387 69.911651 70.062278 71.368186 71.417452 74.425839 74.827315 74.903323 75.229284 75.684852 76.128983
## [73] 78.633691 81.024006 81.243938 82.389993 82.715986 82.816755 83.338173 85.333785 86.290024 87.244911 87.405043 88.224578
## [85] 88.626144 89.132146 89.152510 89.923305 89.945036 90.035454 92.346770 94.183733 94.554952 95.350417 97.377630 98.490422
## [97] 99.043417 99.046164 99.275667 99.666692
plot(x = ru)# 100 normally distributed numbers with a mean of 50 and sd of 10
nd <- rnorm(100, mean = 50, sd = 10)
nd## [1] 64.40363 35.15767 58.80549 61.73534 63.16287 67.63627 37.85798 40.56826 30.12871 49.05927 67.83681 41.93400 47.30832 61.03733
## [15] 51.10479 59.54784 44.26822 48.31049 42.79711 51.59422 40.35661 48.23641 48.67249 43.38192 59.59242 57.64648 54.52588 24.87011
## [29] 42.27818 56.10052 69.48550 47.72032 40.85586 55.94385 55.23585 48.15234 51.54309 37.18134 54.04333 38.40818 44.88821 58.67943
## [43] 36.94628 63.76810 38.92175 65.89515 41.63052 47.77811 58.00939 55.30768 39.75183 44.36097 52.18084 59.39967 31.45070 46.68177
## [57] 46.52008 50.99932 57.76799 65.16272 47.95292 45.98527 57.66679 60.50162 26.16225 67.24224 50.26373 54.48542 53.17396 57.81718
## [71] 46.78950 55.86336 47.87440 52.41916 27.03157 49.42392 46.34853 49.96996 69.71277 56.08718 46.90273 53.15389 63.00920 38.07127
## [85] 39.39763 60.18031 48.21745 42.47960 48.28244 64.29163 49.35019 36.48575 33.13188 36.53864 64.08224 45.03859 72.10061 46.49604
## [99] 48.34971 54.60023
nd <- nd[order(nd)]
nd## [1] 24.87011 26.16225 27.03157 30.12871 31.45070 33.13188 35.15767 36.48575 36.53864 36.94628 37.18134 37.85798 38.07127 38.40818
## [15] 38.92175 39.39763 39.75183 40.35661 40.56826 40.85586 41.63052 41.93400 42.27818 42.47960 42.79711 43.38192 44.26822 44.36097
## [29] 44.88821 45.03859 45.98527 46.34853 46.49604 46.52008 46.68177 46.78950 46.90273 47.30832 47.72032 47.77811 47.87440 47.95292
## [43] 48.15234 48.21745 48.23641 48.28244 48.31049 48.34971 48.67249 49.05927 49.35019 49.42392 49.96996 50.26373 50.99932 51.10479
## [57] 51.54309 51.59422 52.18084 52.41916 53.15389 53.17396 54.04333 54.48542 54.52588 54.60023 55.23585 55.30768 55.86336 55.94385
## [71] 56.08718 56.10052 57.64648 57.66679 57.76799 57.81718 58.00939 58.67943 58.80549 59.39967 59.54784 59.59242 60.18031 60.50162
## [85] 61.03733 61.73534 63.00920 63.16287 63.76810 64.08224 64.29163 64.40363 65.16272 65.89515 67.24224 67.63627 67.83681 69.48550
## [99] 69.71277 72.10061
plot(x = nd)hist(x = nd)hist(nd, breaks = 20, col = "darkgreen")plot(x = density(nd))boxplot(x = nd)boxplot(x = nd, horizontal = T)ggplot2
Lets be honest, the base plots are ugly! The ggplot2
package gives the user to create a better, more visually appealing
plots. Additional packages such as ggbeeswarm and
ggrepel also contain useful functions to add to the
functionality of ggplot2.
- ggplot2 - https://ggplot2.tidyverse.org/
- Tutorial 1 - http://r-statistics.co/ggplot2-Tutorial-With-R.html
- Tutorial 2 - https://www.statsandr.com/blog/graphics-in-r-with-ggplot2/
- The R Graph Gallery - https://www.r-graph-gallery.com/ggplot2-package.html
library(ggplot2)
mp <- ggplot(xd, aes(x = x8, y = x9))
mp + geom_point()mp + geom_point(aes(color = x3, shape = x3), size = 4)mp + geom_line(size = 2)mp + geom_line(aes(color = x3), size = 2)mp + geom_smooth(method = "loess")mp + geom_smooth(method = "lm")xx <- data.frame(data = c(rnorm(50, mean = 40, sd = 10),
rnorm(50, mean = 60, sd = 5)),
group = factor(rep(1:2, each = 50)),
label = c("Label1", rep(NA, 49), "Label2", rep(NA, 49)))
mp <- ggplot(xx, aes(x = data, fill = group))
mp + geom_histogram(color = "black")mp + geom_histogram(color = "black", position = "dodge")mp1 <- mp + geom_histogram(color = "black") + facet_grid(group~.)
mp1mp + geom_density(alpha = 0.5)mp <- ggplot(xx, aes(x = group, y = data, fill = group))
mp + geom_boxplot(color = "black")mp + geom_boxplot() + geom_point()mp + geom_violin() + geom_boxplot(width = 0.1, fill = "white")library(ggbeeswarm)
mp + geom_quasirandom()mp + geom_quasirandom(aes(shape = group))mp2 <- mp + geom_violin() +
geom_boxplot(width = 0.1, fill = "white") +
geom_beeswarm(alpha = 0.5)
library(ggrepel)
mp2 + geom_text_repel(aes(label = label), nudge_x = 0.4)library(ggpubr)
ggarrange(mp1, mp2, ncol = 2, widths = c(2,1),
common.legend = T, legend = "bottom")Statistics
- Handbook of Biological Statistics - http://biostathandbook.com/
- R Companion for ^ - https://rcompanion.org/rcompanion/a_02.html
# Prep data
lev_Loc <- c("Saskatoon, Canada", "Jessore, Bangladesh", "Metaponto, Italy")
lev_Name <- c("ILL 618 AGL", "CDC Maxim AGL", "Laird AGL")
dd <- read_xlsx("data_r_tutorial.xlsx", sheet = "Data") %>%
mutate(Location = factor(Location, levels = lev_Loc),
Name = factor(Name, levels = lev_Name))
xx <- dd %>%
group_by(Name, Location) %>%
summarise(Mean_DTF = mean(DTF))
xx %>% spread(Location, Mean_DTF)## # A tibble: 3 × 4
## # Groups: Name [3]
## Name `Saskatoon, Canada` `Jessore, Bangladesh` `Metaponto, Italy`
## <fct> <dbl> <dbl> <dbl>
## 1 ILL 618 AGL 47 79.3 138.
## 2 CDC Maxim AGL 52.5 86.7 134.
## 3 Laird AGL 56.8 76.8 137.
# Plot
mp1 <- ggplot(dd, aes(x = Location, y = DTF, color = Name, shape = Name)) +
geom_point(size = 2, alpha = 0.7, position = position_dodge(width=0.5))
mp2 <- ggplot(xx, aes(x = Location, y = Mean_DTF,
color = Name, group = Name, shape = Name)) +
geom_point(size = 2.5, alpha = 0.7) +
geom_line(size = 1, alpha = 0.7) +
theme(legend.position = "top")
ggarrange(mp1, mp2, ncol = 2, common.legend = T, legend = "top")From first glace, it is clear there are differences between genotypes, locations, and genotype x environment (GxE) interactions. Now let’s do a few statistical tests.
summary(aov(DTF ~ Name * Location, data = dd))## Df Sum Sq Mean Sq F value Pr(>F)
## Name 2 88 44 3.476 0.0395 *
## Location 2 65863 32931 2598.336 < 2e-16 ***
## Name:Location 4 560 140 11.044 2.52e-06 ***
## Residuals 45 570 13
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
As expected, an ANOVA shows statistical significance for genotype (p-value = 0.0395), Location (p-value < 2e-16) and GxE interactions (p-value < 2.52e-06). However, all this tells us is that one genotype is different from the rest, one location is different from the others and that there is GxE interactions. If we want to be more specific, would need to do some multiple comparison tests.
If we only have two things to compare, we could do a t-test.
xx <- dd %>%
filter(Location %in% c("Saskatoon, Canada", "Jessore, Bangladesh")) %>%
spread(Location, DTF)
t.test(x = xx$`Saskatoon, Canada`, y = xx$`Jessore, Bangladesh`)##
## Welch Two Sample t-test
##
## data: xx$`Saskatoon, Canada` and xx$`Jessore, Bangladesh`
## t = -17.521, df = 32.701, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -32.18265 -25.48402
## sample estimates:
## mean of x mean of y
## 52.11111 80.94444
DTF in Saskatoon, Canada is significantly different (p-value < 2.2e-16) from DTF in Jessore, Bangladesh.
xx <- dd %>%
filter(Name %in% c("ILL 618 AGL", "Laird AGL"),
Location == "Metaponto, Italy") %>%
spread(Name, DTF)
t.test(x = xx$`ILL 618 AGL`, y = xx$`Laird AGL`)##
## Welch Two Sample t-test
##
## data: xx$`ILL 618 AGL` and xx$`Laird AGL`
## t = 0.38008, df = 8.0564, p-value = 0.7137
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -5.059739 7.059739
## sample estimates:
## mean of x mean of y
## 137.8333 136.8333
DTF between ILL 618 AGL and Laird AGL are not significantly different (p-value = 0.7137) in Metaponto, Italy.
pch Plot
xx <- data.frame(x = rep(1:6, times = 5, length.out = 26),
y = rep(5:1, each = 6, length.out = 26),
pch = 0:25)
mp <- ggplot(xx, aes(x = x, y = y, shape = as.factor(pch))) +
geom_point(color = "darkred", fill = "darkblue", size = 5) +
geom_text(aes(label = pch), nudge_x = -0.25) +
scale_shape_manual(values = xx$pch) +
scale_x_continuous(breaks = 6:1) +
scale_y_continuous(breaks = 6:1) +
theme_void() +
theme(legend.position = "none",
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
axis.text = element_blank(),
axis.ticks = element_blank()) +
labs(title = "Plot symbols in R (pch)",
subtitle = "color = \"darkred\", fill = \"darkblue\"",
x = NULL, y = NULL)
ggsave("pch.png", mp, width = 4.5, height = 3, bg = "white")R Markdown
Tutorials on how to create an R markdown document like this one can be found here: